In [3]:
import pandas as pd
import numpy as np
# Load the cardiovascular dataset into the working DataFrame `df`.
# The file is semicolon-delimited: the cells below index individual columns
# such as "age", "height" and "ap_hi", which only exist when the rows are
# split on ";" (a "," separator would not split the fields).
df = pd.read_csv("cardio_train.csv", sep=";")
In [ ]:
# Google Colab only: attach Google Drive to the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
Mounted at /content/drive
In [4]:
import pandas as pd
import numpy as np
# Read the semicolon-delimited dataset; `df` is the frame every later cell
# reads from and adds columns to.
DATASET_PATH = "cardio_train.csv"
df = pd.read_csv(DATASET_PATH, sep=";")
In [5]:
# Converting age from days to whole years: divide by 365, round to the
# nearest year, and store the result as an integer column.
age_in_years = df["age"].div(365)
df["age_years"] = age_in_years.round().astype(int)
In [6]:
# Body-mass index = weight / height^2.
# Author's rationale: weight or height alone is not expected to be predictive,
# although height might be predictive of blood pressure.
# NOTE(review): the /10000 factor implies height is in cm (cm^2 -> m^2) and
# weight in kg -- confirm against the dataset description.
height_squared_m2 = df["height"] * df["height"] / 10000
df["BMI"] = df["weight"] / height_squared_m2
In [7]:
# Data cleaning: remove rows with implausible blood-pressure readings.
# Only the blood-pressure columns (ap_hi, ap_lo) are cleaned in this cell.
#
# All thresholds are fixed constants, so re-running the cell on already-cleaned
# data removes nothing further (the cell is idempotent).
AP_HI_MIN, AP_HI_MAX = 50, 250   # accepted range for ap_hi
AP_LO_MIN, AP_LO_MAX = 30, 140   # accepted range for ap_lo

ap_hi_out_of_range = (df['ap_hi'] < AP_HI_MIN) | (df['ap_hi'] > AP_HI_MAX)
ap_lo_out_of_range = (df['ap_lo'] < AP_LO_MIN) | (df['ap_lo'] > AP_LO_MAX)
# A reading where ap_hi is below ap_lo is internally inconsistent
# (ap_hi is presumably systolic and ap_lo diastolic -- confirm).
ap_hi_below_ap_lo = df['ap_hi'] < df['ap_lo']

# NOTE(review): this cell previously also dropped rows where ap_lo exceeded the
# 2.5th percentile of ap_hi. That compared one variable against a quantile of
# a different one, silently overrode the explicit AP_LO_MAX bound, and removed
# more rows on every re-run because the quantile was recomputed on the
# already-filtered data. The term is removed; if percentile trimming is wanted,
# compare each column against its own quantiles, computed once.
invalid_bp = ap_hi_out_of_range | ap_lo_out_of_range | ap_hi_below_ap_lo

# Keep every row not flagged as invalid; .copy() lets later cells add columns
# to `df` without pandas chained-assignment warnings.
df = df.loc[~invalid_bp].copy()
In [8]:
# Mean arterial pressure: one third of ap_hi plus two thirds of ap_lo.
ap_hi_share = df["ap_hi"] / 3
ap_lo_share = 2 * df["ap_lo"] / 3
df["MAP"] = ap_hi_share + ap_lo_share
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations between the engineered features ('age_years', 'BMI',
# 'MAP'), the categorical/binary risk factors, and the 'cardio' target.
heatmap_columns = [
    'age_years', 'BMI', 'cholesterol', 'gluc',
    'smoke', 'alco', 'active', 'MAP', 'cardio',
]
corr_matrix = df.loc[:, heatmap_columns].corr()

# Annotated heatmap of the correlation matrix (values shown to 2 decimals).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data=corr_matrix, annot=True, fmt='.2f', cmap='viridis', ax=ax)
ax.set_title('Correlation Heatmap of CVD Factors')
plt.show()
In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
# Split violin plot of mean arterial pressure (MAP) by cholesterol category.
#   x     -> categorical feature (cholesterol)
#   y     -> continuous feature (MAP)
#   hue   -> target (cardio); split=True draws the two hue levels as the two
#            halves of each violin for a direct comparison
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=df, x='cholesterol', y='MAP', hue='cardio',
               split=True, inner='quartile', palette='Set1', ax=ax)

# The plotted variable is MAP, so the title names MAP (it previously said
# "Systolic Blood Pressure", which did not match the y-axis).
ax.set_title('Mean Arterial Pressure (MAP) Distribution by Cholesterol and CVD Status')
ax.set_xlabel('Cholesterol Level (1: norm, 2: above, 3: well above)')
ax.set_ylabel('Mean Arterial Blood Pressure (MAP)')

# Rename the legend entries while reusing the handles seaborn registered for
# the hue levels. Passing only `labels=` to legend() lets matplotlib pair the
# labels with whatever artists come first on the axes (e.g. quartile lines),
# which can show the wrong colours.
# Mapping assumes cardio is coded 0 = no CVD, 1 = CVD; unknown levels keep
# their original label.
cardio_names = {'0': 'No CVD', '1': 'CVD'}
handles, auto_labels = ax.get_legend_handles_labels()
ax.legend(handles=handles,
          labels=[cardio_names.get(str(lbl), lbl) for lbl in auto_labels],
          title='Cardio', loc='upper left')
plt.show()
In [12]:
import plotly.express as px
# Interactive 3-D scatter of age (years), MAP and BMI, coloured by the
# cardio label.
fig = px.scatter_3d(
    data_frame=df,
    x='age_years',
    y='MAP',
    z='BMI',
    color='cardio',
)
fig.show()
In [13]:
# Interactive 2-D scatter of MAP against age, coloured by the cardio label.
# x, y and color are column names looked up in `df`. Note that "age" is the
# raw age column (in days, per the earlier days-to-years conversion), not
# "age_years".
import plotly.express as px
fig = px.scatter(df, x="MAP", y="age", color="cardio")
fig.show()